Kiva - overview tags¶

In [1]:
import os

# Restrict this notebook to the first two GPUs before cudf initializes CUDA.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
In [2]:
import numpy as np
import pandas as pd
import cudf
import plotly.express as px
from tqdm import tqdm

# Register `progress_apply` on pandas objects (used for the per-row
# date-range expansion later in this notebook).
tqdm.pandas()

Import raw data¶

First, read the data in .jsonl file format as a pandas data frame. Then store the dataframe in .parquet format for easy access later.

In [3]:
# ds = cudf.read_parquet("../fulldata/kiva_2023-08-10T17-57-12.parquet")
# Load the pre-converted Kiva loan snapshot as a GPU DataFrame.
ds = cudf.read_parquet("../fulldata/kiva_2023-08-20T16-16-43.parquet")
In [4]:
# Count rows where every column is NaN (artifacts of the raw export).
ds.isna().all(axis=1).sum()
Out[4]:
24
In [5]:
# Remove the fully-empty rows found above.
ds = ds.dropna(axis=0, how="all")

store the names of the interesting columns for easy access

In [6]:
class COL:
    """Names of the dataframe columns used throughout this notebook."""

    LOAN_AMOUNT = "loanAmount"
    FUNDED_AMOUNT = "loanFundraisingInfo.fundedAmount"
    RAISED_DATE = "raisedDate"  # when the loan finished fundraising
    POSTED_DATE = "fundraisingDate"  # when the loan started fundraising
    TAGS = "tags"
    COUNTRY_NAME = "geocode.country.name"
    COUNTRY = "geocode.country.isoCode"
    REGION = "geocode.country.region"
    STATE = "geocode.state"
    LAT = "geocode.latitude"
    LONG = "geocode.longitude"
    SPEED = "collection_speed"  # derived later: funded amount per day
In [7]:
# Full list of columns available in the raw snapshot.
ds.columns
Out[7]:
Index(['anonymizationLevel', 'borrowerCount', 'borrowers', 'dafEligible',
       'delinquent', 'description', 'descriptionInOriginalLanguage',
       'disbursalDate', 'distributionModel', 'fundraisingDate', 'gender',
       'hasCurrencyExchangeLossLenders', 'id', 'isMatchable', 'inPfp',
       'loanAmount', 'lenderRepaymentTerm', 'matcherAccountId', 'matcherName',
       'matchRatio', 'matchingText', 'name', 'minNoteSize', 'paidAmount',
       'pfpMinLenders', 'plannedExpirationDate', 'previousLoanId',
       'raisedDate', 'researchScore', 'repaymentInterval', 'status', 'tags',
       'use', 'video', 'whySpecial', 'activity.id', 'activity.name',
       'endorser.id', 'endorser.image.id', 'endorser.image.url',
       'endorser.inviteeCount', 'endorser.lenderPage.city',
       'endorser.lenderPage.state', 'endorser.lenderPage.country.name',
       'endorser.lenderPage.country.isoCode',
       'endorser.lenderPage.country.region', 'endorser.lenderPage.country.ppp',
       'endorser.lenderPage.country.numLoansFundraising',
       'endorser.lenderPage.country.fundsLentInCountry',
       'endorser.lenderPage.loanBecause', 'endorser.lenderPage.occupation',
       'endorser.lenderPage.otherInfo', 'endorser.lenderPage.url',
       'endorser.lenderPage.whereabouts', 'endorser.loanCount',
       'endorser.memberSince', 'endorser.name', 'endorser.publicId',
       'geocode.city', 'geocode.state', 'geocode.country.name',
       'geocode.country.isoCode', 'geocode.country.region',
       'geocode.country.ppp', 'geocode.country.numLoansFundraising',
       'geocode.country.fundsLentInCountry', 'geocode.postalCode',
       'geocode.latitude', 'geocode.longitude', 'image.id', 'image.url',
       'loanFundraisingInfo.fundedAmount',
       'loanFundraisingInfo.isExpiringSoon',
       'loanFundraisingInfo.reservedAmount', 'originalLanguage.id',
       'originalLanguage.isActive', 'originalLanguage.isoCode',
       'originalLanguage.name', 'sector.id', 'sector.name', 'terms.currency',
       'terms.currencyFullName', 'terms.disbursalAmount',
       'terms.disbursalDate', 'terms.expectedPayments', 'terms.loanAmount',
       'terms.lenderRepaymentTerm', 'terms.lossLiabilityCurrencyExchange',
       'terms.lossLiabilityNonpayment', 'terms.flexibleFundraisingEnabled',
       'userProperties.favorited', 'userProperties.lentTo',
       'userProperties.subscribed', 'userProperties.promoEligible',
       'userProperties.amountInBasket', 'endorser.lenderPage.country',
       'endorser', 'video.thumbnailImageId', 'video.youtubeId'],
      dtype='object')
In [8]:
# Verify the index has no duplicates before subsetting/joining.
ds.index.duplicated().sum()
Out[8]:
array(0)
In [9]:
# Keep only the columns used in this analysis.
KEEP_COLUMNS = [
    COL.LOAN_AMOUNT,
    COL.FUNDED_AMOUNT,
    COL.RAISED_DATE,
    COL.POSTED_DATE,
    # "disbursalDate",
    COL.COUNTRY_NAME,
    COL.COUNTRY,
    COL.STATE,
    COL.REGION,
    COL.LAT,
    COL.LONG,
    COL.TAGS,
]
ds = ds[KEEP_COLUMNS]

ds.tail()
Out[9]:
loanAmount loanFundraisingInfo.fundedAmount raisedDate fundraisingDate geocode.country.name geocode.country.isoCode geocode.state geocode.country.region geocode.latitude geocode.longitude tags
2564682 600.00 600.00 2006-07-29T15:12:14Z 2006-07-27T18:06:53Z Honduras HN El Paraiso Central America 14.083333 -86.500000 []
2564683 650.00 650.00 2006-07-29T19:53:13Z 2006-07-27T17:00:10Z Honduras HN El Paraiso Central America 14.033333 -86.583333 []
2564684 150.00 150.00 2006-07-27T13:41:46Z 2006-07-27T06:02:07Z Kenya KE Kiambu Africa -1.166667 36.833333 []
2564685 225.00 225.00 2006-07-28T12:54:18Z 2006-07-27T01:24:27Z Kenya KE Kiambu Africa -1.166667 36.833333 []
2564686 150.00 150.00 2006-07-27T04:44:24Z 2006-07-27T01:23:56Z Kenya KE Kiambu Africa -1.166667 36.833333 []
In [10]:
# Tighten dtypes to reduce GPU memory: float32 for amounts/coordinates,
# proper datetimes for the two timestamps, categories for geo strings.
for _col in (COL.LOAN_AMOUNT, COL.FUNDED_AMOUNT, COL.LAT, COL.LONG):
    ds[_col] = ds[_col].astype("float32")
for _col in (COL.RAISED_DATE, COL.POSTED_DATE):
    ds[_col] = cudf.to_datetime(ds[_col], format="%Y-%m-%dT%H:%M:%SZ")
for _col in (COL.COUNTRY, COL.COUNTRY_NAME, COL.REGION, COL.STATE):
    ds[_col] = ds[_col].astype("category")
In [11]:
# Check dtypes and memory footprint after the conversions above.
ds.info()
<class 'cudf.core.dataframe.DataFrame'>
Int64Index: 2564663 entries, 0 to 2564686
Data columns (total 11 columns):
 #   Column                            Dtype
---  ------                            -----
 0   loanAmount                        float32
 1   loanFundraisingInfo.fundedAmount  float32
 2   raisedDate                        datetime64[ns]
 3   fundraisingDate                   datetime64[ns]
 4   geocode.country.name              category
 5   geocode.country.isoCode           category
 6   geocode.state                     category
 7   geocode.country.region            category
 8   geocode.latitude                  float32
 9   geocode.longitude                 float32
 10  tags                              list
dtypes: category(4), datetime64[ns](2), float32(4), list(1)
memory usage: 153.8 MB
In [12]:
# Index is still duplicate-free after the dtype conversions.
ds.index.duplicated().sum()
Out[12]:
array(0)

Preprocessing¶

We keep only the successful loans¶

In [13]:
# Fraction of loans that were fully funded (funded amount equals requested).
success = ds[COL.LOAN_AMOUNT] == ds[COL.FUNDED_AMOUNT]
n_success = int(success.sum())
n_success / len(success), len(ds)
Out[13]:
(0.9564589967570788, 2564663)
In [14]:
# keep success only (drops the ~4% of loans that never reached their target)
ds = ds[success]

Drop some NaN¶

In [15]:
# Null counts per column: only raisedDate, state and lat/long have gaps.
ds.isna().sum()
Out[15]:
loanAmount                               0
loanFundraisingInfo.fundedAmount         0
raisedDate                               2
fundraisingDate                          0
geocode.country.name                     0
geocode.country.isoCode                  0
geocode.state                       300254
geocode.country.region                   0
geocode.latitude                    244728
geocode.longitude                   244728
tags                                     0
dtype: int64
In [16]:
# Drop rows missing any of the key amount/date columns (only 2 rows lack
# raisedDate); geo NaNs are deliberately kept.
ds.dropna(subset=[COL.LOAN_AMOUNT, COL.FUNDED_AMOUNT, COL.POSTED_DATE, COL.RAISED_DATE], inplace=True)

Collection Speed¶

In [17]:
# Derive funding duration and collection speed (USD raised per day).
ds["funding_duration"] = ds[COL.RAISED_DATE] - ds[COL.POSTED_DATE]
# timedelta64[ns] -> days in one step (ns / 1e9 -> seconds, / 86400 -> days).
# Previously this column briefly held seconds before being overwritten with
# days, which was misleading; the math below is unchanged.
ds["funding_duration_days"] = ds["funding_duration"].astype("int64") / pow(10, 9) / (24 * 60 * 60)
ds[COL.SPEED] = ds[COL.FUNDED_AMOUNT] / ds["funding_duration_days"]
ds.head()
Out[17]:
loanAmount loanFundraisingInfo.fundedAmount raisedDate fundraisingDate geocode.country.name geocode.country.isoCode geocode.state geocode.country.region geocode.latitude geocode.longitude tags funding_duration funding_duration_days collection_speed
24 75.0 75.0 2023-08-20 15:49:49 2023-08-20 15:30:50 Philippines PH Eastern Visayas Asia 10.165355 124.84034 [#Parent] 0 days 00:18:59 0.013183 5689.201054
28 100.0 100.0 2023-08-20 16:06:26 2023-08-20 15:30:49 Philippines PH Eastern Visayas Asia 10.165355 124.84034 [volunteer_pick, volunteer_like] 0 days 00:35:37 0.024734 4043.051006
47 100.0 100.0 2023-08-20 15:49:49 2023-08-20 15:10:11 Philippines PH Eastern Visayas Asia 10.165355 124.84034 [volunteer_pick, volunteer_like] 0 days 00:39:38 0.027523 3633.305299
61 100.0 100.0 2023-08-20 15:54:58 2023-08-20 14:50:11 Philippines PH Eastern Visayas Asia 10.165355 124.84034 [#Health and Sanitation] 0 days 01:04:47 0.044988 2222.793928
63 100.0 100.0 2023-08-20 15:34:06 2023-08-20 14:50:11 Philippines PH Eastern Visayas Asia 10.165355 124.84034 [] 0 days 00:43:55 0.030498 3278.937381

There are some projects which are fulfilled before being published.
Let's show them and then get rid of them.

In [18]:
# some project is already fulfilled before publish
# (raisedDate precedes fundraisingDate, giving a negative duration/speed)
ds[ds[COL.SPEED] < 0]
Out[18]:
loanAmount loanFundraisingInfo.fundedAmount raisedDate fundraisingDate geocode.country.name geocode.country.isoCode geocode.state geocode.country.region geocode.latitude geocode.longitude tags funding_duration funding_duration_days collection_speed
866326 125.0 125.0 2015-08-26 15:27:35 2015-08-26 18:59:15 Kenya KE Africa 1.0 38.0 [] -1 days +20:28:20 -0.146991 -850.393701
870336 125.0 125.0 2015-08-13 20:58:07 2015-08-13 21:17:46 Kenya KE Africa 1.0 38.0 [] -1 days +23:40:21 -0.013646 -9160.305344
875658 125.0 125.0 2015-07-29 14:35:38 2015-07-29 17:13:45 Kenya KE Africa 1.0 38.0 [] -1 days +21:21:53 -0.109803 -1138.399916
885849 125.0 125.0 2015-06-23 09:52:47 2015-06-23 14:49:13 Kenya KE Africa 1.0 38.0 [] -1 days +19:03:34 -0.205856 -607.219161
894007 125.0 125.0 2015-05-22 03:42:29 2015-05-22 04:32:53 Kenya KE Africa 1.0 38.0 [] -1 days +23:09:36 -0.035000 -3571.428571
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2564082 500.0 500.0 2005-03-31 06:27:55 2005-04-15 17:00:00 Uganda UG Eastern Region Africa 0.75 34.08333206 [] -16 days +13:27:55 -15.438947 -32.385629
2564083 300.0 300.0 2005-03-31 06:27:55 2005-04-15 17:00:00 Uganda UG Eastern Region Africa 0.75 34.08333206 [user_favorite] -16 days +13:27:55 -15.438947 -19.431377
2564084 500.0 500.0 2005-03-31 06:27:55 2005-04-15 17:00:00 Uganda UG Eastern Region Africa 0.75 34.08333206 [user_favorite] -16 days +13:27:55 -15.438947 -32.385629
2564085 300.0 300.0 2005-03-31 06:27:55 2005-04-15 17:00:00 Uganda UG Eastern Region Africa 0.75 34.08333206 [user_favorite] -16 days +13:27:55 -15.438947 -19.431377
2564086 500.0 500.0 2005-03-31 06:27:55 2005-04-15 17:00:00 Uganda UG Eastern Region Africa 0.75 34.08333206 [user_favorite] -16 days +13:27:55 -15.438947 -32.385629

346 rows × 14 columns

In [19]:
# Keep only loans funded at or after their posting time.
ds = ds[ds[COL.SPEED] >= 0]

Encode tags using MultiLabelBinarizer¶

In [20]:
# Move the list-typed `tags` column to pandas: the sklearn binarizer below
# runs on host (CPU) data.
tagdf = ds[["tags"]].to_pandas()
tagdf.head()
Out[20]:
tags
24 [#Parent]
28 [volunteer_pick, volunteer_like]
47 [volunteer_pick, volunteer_like]
61 [#Health and Sanitation]
63 []
In [21]:
# NOTE(review): mid-notebook import; conventionally this belongs in the
# import cell at the top.
from sklearn.preprocessing import MultiLabelBinarizer

lb = MultiLabelBinarizer()

# One-hot matrix: one row per loan, one column per distinct tag string.
mlb = lb.fit_transform(tagdf["tags"])
mlb.shape
Out[21]:
(2452647, 65)
In [22]:
# One int8 indicator column per tag, aligned to the original loan index.
tags_columns = [f"tag_{cls}" for cls in lb.classes_]
tag_ds = cudf.DataFrame(mlb, columns=tags_columns, dtype="int8", index=tagdf.index)
del tagdf  # the host copy is no longer needed
tag_ds.sum()
Out[22]:
tag_                                     9
tag_#Agriculture                        17
tag_#Animals                        172147
tag_#BIPOC-owned Business             6686
tag_#Biz Durable Asset              104948
                                     ...  
tag_reserved_crisis_support_loan        29
tag_user_favorite                   695468
tag_user_like                            1
tag_volunteer_like                   42151
tag_volunteer_pick                   75913
Length: 65, dtype: int64
In [23]:
tag_ds.columns
Out[23]:
Index(['tag_', 'tag_#Agriculture', 'tag_#Animals', 'tag_#BIPOC-owned Business',
       'tag_#Biz Durable Asset', 'tag_#COVID-19', 'tag_#CommunityImpact',
       'tag_#Eco-friendly', 'tag_#EcoFriendly', 'tag_#Education',
       'tag_#Elderly', 'tag_#Fabrics', 'tag_#Female Education',
       'tag_#First Loan', 'tag_#GenderEquity', 'tag_#Health and Sanitation',
       'tag_#Hidden Gem', 'tag_#Inspiring Story', 'tag_#Interesting Photo',
       'tag_#Job Creator', 'tag_#Kaiser',
       'tag_#Latinx/Hispanic-Owned Business', 'tag_#Low-profit FP',
       'tag_#Married', 'tag_#NewBusiness', 'tag_#Orphan', 'tag_#Parent',
       'tag_#Post-disbursed', 'tag_#Powerful Story', 'tag_#Refugee',
       'tag_#Repair Renew Replace', 'tag_#Repeat Borrower', 'tag_#Schooling',
       'tag_#Single', 'tag_#Single Parent', 'tag_#StandoutBackstory',
       'tag_#Supporting Family', 'tag_#Sustainable Ag',
       'tag_#TangibleProducts', 'tag_#Team Guys Holding Fish',
       'tag_#Technology', 'tag_#Tourism', 'tag_#Trees',
       'tag_#US Black-Owned Business', 'tag_#US Environmental Loan',
       'tag_#US immigrant', 'tag_#Umpqua', 'tag_#Unique', 'tag_#Vegan',
       'tag_#Widowed', 'tag_#Woman-Owned Business', 'tag_GoDaddy', 'tag_LGBTQ',
       'tag_LISCChicago', 'tag_MUFG', 'tag_Salesforce', 'tag_US Refugee',
       'tag_Viral', 'tag_beauty', 'tag_cow',
       'tag_reserved_crisis_support_loan', 'tag_user_favorite',
       'tag_user_like', 'tag_volunteer_like', 'tag_volunteer_pick'],
      dtype='object')
In [24]:
# join with the original df
ds = ds.join(tag_ds)
del tag_ds  # free GPU memory; the indicators now live in `ds`
ds.head()
Out[24]:
loanAmount loanFundraisingInfo.fundedAmount raisedDate fundraisingDate geocode.country.name geocode.country.isoCode geocode.state geocode.country.region geocode.latitude geocode.longitude ... tag_Salesforce tag_US Refugee tag_Viral tag_beauty tag_cow tag_reserved_crisis_support_loan tag_user_favorite tag_user_like tag_volunteer_like tag_volunteer_pick
26655 1725.0 1725.0 2023-06-13 12:08:22 2023-05-23 20:40:05 Vietnam VN <NA> Asia <NA> <NA> ... 0 0 0 0 0 0 1 0 0 0
26656 2150.0 2150.0 2023-05-27 22:18:41 2023-05-23 20:40:05 Vietnam VN Thanh Hoa Asia 19.97921181 106.0111237 ... 0 0 0 0 0 0 1 0 0 0
26657 2975.0 2975.0 2023-06-15 18:26:53 2023-05-23 20:40:03 Samoa WS <NA> Oceania <NA> <NA> ... 0 0 0 0 0 0 1 0 0 0
26658 700.0 700.0 2023-06-22 14:36:29 2023-05-23 20:20:26 Fiji FJ <NA> Oceania <NA> <NA> ... 0 0 0 0 0 0 0 0 0 0
26659 150.0 150.0 2023-05-23 22:06:57 2023-05-23 20:20:25 Madagascar MG Antananarivo Province Africa -19.65134239 47.32661438 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 79 columns

In [25]:
# sanity check: the binarized tag_* columns must agree with the raw `tags`
# lists on a random sample of rows.
# NOTE(review): ds.sample has no fixed seed, so this check is
# non-deterministic across runs.
tag_columns = [a for a in ds.columns if a.startswith("tag_")]

for testcase in range(50):
    sam = ds.sample(1)
    tags_list = sam["tags"].iloc[0]
    for atag in tag_columns:
        # Strip only the leading "tag_" prefix; str.replace would also mangle
        # any occurrence of "tag_" inside the tag name itself.
        expected = 1 if atag[len("tag_"):] in tags_list else 0
        assert sam[atag].iloc[0] == expected
In [26]:
# drop the original `tags` column: it is now fully encoded in the tag_* indicators
ds.drop(["tags"], axis=1, inplace=True)

Quickly refine tags¶

merge 'tag_#Eco-friendly' and 'tag_#EcoFriendly'¶

In [27]:
# A loan is eco-friendly if either spelling of the tag is set; collapse the
# two columns into `tag_#EcoFriendly`.
eco_any = (ds["tag_#Eco-friendly"] + ds["tag_#EcoFriendly"]) > 0
ds["tag_#EcoFriendly"] = eco_any.astype("int8")
ds = ds.drop("tag_#Eco-friendly", axis=1)

Keep tags visible to users¶

Take a look at this screenshot. We notice that some tags in the dataframe are not displayed on the website

Alt text

These undisplayed tags are tag_user_favorite, tag_user_like, tag_volunteer_like, tag_volunteer_pick. It might be because those are used internally in the Kiva platform

And because those tags are not shown to lenders, we can get rid of them here

In [28]:
# drop some meaningless tags: `tag_` comes from empty-string tags, and the
# user_/volunteer_ tags are internal and never displayed to lenders
ds.drop(
    ["tag_", "tag_user_favorite", "tag_user_like", "tag_volunteer_like", "tag_volunteer_pick"], axis=1, inplace=True
)
In [29]:
# Final shape/dtypes after tag cleanup.
ds.info()
<class 'cudf.core.dataframe.DataFrame'>
Int64Index: 2452647 entries, 26655 to 2564545
Data columns (total 72 columns):
 #   Column                               Dtype
---  ------                               -----
 0   loanAmount                           float32
 1   loanFundraisingInfo.fundedAmount     float32
 2   raisedDate                           datetime64[ns]
 3   fundraisingDate                      datetime64[ns]
 4   geocode.country.name                 category
 5   geocode.country.isoCode              category
 6   geocode.state                        category
 7   geocode.country.region               category
 8   geocode.latitude                     float32
 9   geocode.longitude                    float32
 10  funding_duration                     timedelta64[ns]
 11  funding_duration_days                float64
 12  collection_speed                     float64
 13  tag_#Agriculture                     int8
 14  tag_#Animals                         int8
 15  tag_#BIPOC-owned Business            int8
 16  tag_#Biz Durable Asset               int8
 17  tag_#COVID-19                        int8
 18  tag_#CommunityImpact                 int8
 19  tag_#EcoFriendly                     int8
 20  tag_#Education                       int8
 21  tag_#Elderly                         int8
 22  tag_#Fabrics                         int8
 23  tag_#Female Education                int8
 24  tag_#First Loan                      int8
 25  tag_#GenderEquity                    int8
 26  tag_#Health and Sanitation           int8
 27  tag_#Hidden Gem                      int8
 28  tag_#Inspiring Story                 int8
 29  tag_#Interesting Photo               int8
 30  tag_#Job Creator                     int8
 31  tag_#Kaiser                          int8
 32  tag_#Latinx/Hispanic-Owned Business  int8
 33  tag_#Low-profit FP                   int8
 34  tag_#Married                         int8
 35  tag_#NewBusiness                     int8
 36  tag_#Orphan                          int8
 37  tag_#Parent                          int8
 38  tag_#Post-disbursed                  int8
 39  tag_#Powerful Story                  int8
 40  tag_#Refugee                         int8
 41  tag_#Repair Renew Replace            int8
 42  tag_#Repeat Borrower                 int8
 43  tag_#Schooling                       int8
 44  tag_#Single                          int8
 45  tag_#Single Parent                   int8
 46  tag_#StandoutBackstory               int8
 47  tag_#Supporting Family               int8
 48  tag_#Sustainable Ag                  int8
 49  tag_#TangibleProducts                int8
 50  tag_#Team Guys Holding Fish          int8
 51  tag_#Technology                      int8
 52  tag_#Tourism                         int8
 53  tag_#Trees                           int8
 54  tag_#US Black-Owned Business         int8
 55  tag_#US Environmental Loan           int8
 56  tag_#US immigrant                    int8
 57  tag_#Umpqua                          int8
 58  tag_#Unique                          int8
 59  tag_#Vegan                           int8
 60  tag_#Widowed                         int8
 61  tag_#Woman-Owned Business            int8
 62  tag_GoDaddy                          int8
 63  tag_LGBTQ                            int8
 64  tag_LISCChicago                      int8
 65  tag_MUFG                             int8
 66  tag_Salesforce                       int8
 67  tag_US Refugee                       int8
 68  tag_Viral                            int8
 69  tag_beauty                           int8
 70  tag_cow                              int8
 71  tag_reserved_crisis_support_loan     int8
dtypes: category(4), datetime64[ns](2), float32(4), float64(2), int8(59), timedelta64[ns](1)
memory usage: 300.3 MB
In [30]:
# Remaining nulls are confined to the geo columns (state, lat, long).
ds.isna().sum().sort_values()
Out[30]:
loanAmount                               0
loanFundraisingInfo.fundedAmount         0
raisedDate                               0
fundraisingDate                          0
geocode.country.name                     0
                                     ...  
tag_cow                                  0
tag_reserved_crisis_support_loan         0
geocode.latitude                    244711
geocode.longitude                   244711
geocode.state                       300236
Length: 72, dtype: int64

Now drawing¶

Tags vs time¶

In [31]:
# For every loan, build the list of days it was actively fundraising.
# NOTE(review): a row-wise progress_apply over ~2.45M rows takes ~7 minutes
# (see the tqdm output below); a vectorized expansion would be much faster.
time_df = ds[[COL.POSTED_DATE, COL.RAISED_DATE]].to_pandas()
time_df["date"] = time_df.progress_apply(
    lambda row: list(pd.date_range(row[COL.POSTED_DATE], row[COL.RAISED_DATE])), axis=1
)
time_ds = cudf.from_pandas(time_df)
del time_df  # free host memory
time_ds.head()
100%|██████████| 2452647/2452647 [06:42<00:00, 6093.39it/s]
Out[31]:
fundraisingDate raisedDate date
26655 2023-05-23 20:40:05 2023-06-13 12:08:22 [2023-05-23T20:40:05.000000, 2023-05-24T20:40:...
26656 2023-05-23 20:40:05 2023-05-27 22:18:41 [2023-05-23T20:40:05.000000, 2023-05-24T20:40:...
26657 2023-05-23 20:40:03 2023-06-15 18:26:53 [2023-05-23T20:40:03.000000, 2023-05-24T20:40:...
26658 2023-05-23 20:20:26 2023-06-22 14:36:29 [2023-05-23T20:20:26.000000, 2023-05-24T20:20:...
26659 2023-05-23 20:20:25 2023-05-23 22:06:57 [2023-05-23T20:20:25.000000]
In [32]:
# Attach the per-loan day lists back onto the full feature frame
# (the duplicated date columns get _x/_y suffixes).
time_ds = ds.merge(time_ds, left_index=True, right_index=True)
time_ds.head()
Out[32]:
loanAmount loanFundraisingInfo.fundedAmount raisedDate_x fundraisingDate_x geocode.country.name geocode.country.isoCode geocode.state geocode.country.region geocode.latitude geocode.longitude ... tag_MUFG tag_Salesforce tag_US Refugee tag_Viral tag_beauty tag_cow tag_reserved_crisis_support_loan fundraisingDate_y raisedDate_y date
4630 250.0 250.0 2023-08-04 01:36:00 2023-08-02 02:20:07 Sierra Leone SL <NA> Africa <NA> <NA> ... 0 0 0 0 0 0 0 2023-08-02 02:20:07 2023-08-04 01:36:00 [2023-08-02T02:20:07.000000, 2023-08-03T02:20:...
4631 4150.0 4150.0 2023-08-12 16:24:47 2023-08-02 02:20:04 Paraguay PY Cordillera South America -25.38333321 -57.15000153 ... 0 0 0 0 0 0 0 2023-08-02 02:20:04 2023-08-12 16:24:47 [2023-08-02T02:20:04.000000, 2023-08-03T02:20:...
4632 1000.0 1000.0 2023-08-20 01:23:09 2023-08-02 02:00:11 Cambodia KH Siem Reap Asia 13.36666679 103.8499985 ... 0 0 0 0 0 0 0 2023-08-02 02:00:11 2023-08-20 01:23:09 [2023-08-02T02:00:11.000000, 2023-08-03T02:00:...
4634 200.0 200.0 2023-08-02 07:09:51 2023-08-02 02:00:10 Indonesia ID Banten Asia -6.748270512 105.688179 ... 0 0 0 0 0 0 0 2023-08-02 02:00:10 2023-08-02 07:09:51 [2023-08-02T02:00:10.000000]
4636 175.0 175.0 2023-08-02 10:57:40 2023-08-02 02:00:09 Indonesia ID Banten Asia -6.178055763 106.6299973 ... 0 0 0 0 0 0 0 2023-08-02 02:00:09 2023-08-02 10:57:40 [2023-08-02T02:00:09.000000]

5 rows × 75 columns

In [33]:
# One row per (loan, active day); floor timestamps to whole days.
time_ds = time_ds.explode("date")
time_ds["date"] = time_ds["date"].dt.floor("D")
time_ds.head()
Out[33]:
loanAmount loanFundraisingInfo.fundedAmount raisedDate_x fundraisingDate_x geocode.country.name geocode.country.isoCode geocode.state geocode.country.region geocode.latitude geocode.longitude ... tag_MUFG tag_Salesforce tag_US Refugee tag_Viral tag_beauty tag_cow tag_reserved_crisis_support_loan fundraisingDate_y raisedDate_y date
4630 250.0 250.0 2023-08-04 01:36:00 2023-08-02 02:20:07 Sierra Leone SL <NA> Africa <NA> <NA> ... 0 0 0 0 0 0 0 2023-08-02 02:20:07 2023-08-04 01:36:00 2023-08-02
4630 250.0 250.0 2023-08-04 01:36:00 2023-08-02 02:20:07 Sierra Leone SL <NA> Africa <NA> <NA> ... 0 0 0 0 0 0 0 2023-08-02 02:20:07 2023-08-04 01:36:00 2023-08-03
4631 4150.0 4150.0 2023-08-12 16:24:47 2023-08-02 02:20:04 Paraguay PY Cordillera South America -25.38333321 -57.15000153 ... 0 0 0 0 0 0 0 2023-08-02 02:20:04 2023-08-12 16:24:47 2023-08-02
4631 4150.0 4150.0 2023-08-12 16:24:47 2023-08-02 02:20:04 Paraguay PY Cordillera South America -25.38333321 -57.15000153 ... 0 0 0 0 0 0 0 2023-08-02 02:20:04 2023-08-12 16:24:47 2023-08-03
4631 4150.0 4150.0 2023-08-12 16:24:47 2023-08-02 02:20:04 Paraguay PY Cordillera South America -25.38333321 -57.15000153 ... 0 0 0 0 0 0 0 2023-08-02 02:20:04 2023-08-12 16:24:47 2023-08-04

5 rows × 75 columns

In [34]:
# Daily active-project counts per tag: for each tag, count the exploded
# (project, day) rows carrying that tag on each date.
tag_columns = [a for a in ds.columns if a.startswith("tag_")]
tag_counts = [
    time_ds[time_ds[atag] == 1]["date"].value_counts().rename(atag)
    for atag in tqdm(tag_columns)
]
tag_counts_concat = cudf.concat(tag_counts, axis=1)
tag_counts_concat
100%|██████████| 59/59 [00:01<00:00, 35.69it/s]
Out[34]:
tag_#Agriculture tag_#Animals tag_#BIPOC-owned Business tag_#Biz Durable Asset tag_#COVID-19 tag_#CommunityImpact tag_#EcoFriendly tag_#Education tag_#Elderly tag_#Fabrics ... tag_GoDaddy tag_LGBTQ tag_LISCChicago tag_MUFG tag_Salesforce tag_US Refugee tag_Viral tag_beauty tag_cow tag_reserved_crisis_support_loan
2008-01-29 <NA> 1 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> ... <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
2009-07-03 <NA> 5 <NA> 1 <NA> <NA> <NA> <NA> <NA> <NA> ... <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
2009-07-04 <NA> 5 <NA> 1 <NA> <NA> <NA> <NA> <NA> <NA> ... <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
2009-07-05 <NA> 5 <NA> 1 <NA> <NA> <NA> <NA> <NA> <NA> ... <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
2009-07-06 <NA> 4 <NA> 1 <NA> <NA> <NA> <NA> <NA> <NA> ... <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2023-08-18 <NA> 170 9 70 <NA> 1 139 <NA> 181 47 ... <NA> 2 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
2023-08-19 <NA> 76 5 36 <NA> <NA> 52 <NA> 80 23 ... <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
2023-08-20 <NA> 32 4 13 <NA> <NA> 27 <NA> 40 12 ... <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
2023-08-21 <NA> 6 3 5 <NA> <NA> 8 <NA> 4 2 ... <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
2023-08-22 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> 1 1 ... <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>

4377 rows × 59 columns

In [35]:
# Align the per-tag daily counts onto a continuous daily date range, then
# melt to long format (date, tag, value) for plotting.
date_range = list(pd.date_range(ds[COL.POSTED_DATE].min(), ds[COL.RAISED_DATE].max(), normalize=True))
tag_hist = cudf.DataFrame({"date": date_range})
tag_hist = tag_hist.set_index("date")
tag_hist = tag_hist.merge(tag_counts_concat, left_index=True, right_index=True, how="outer")
tag_hist_display = tag_hist.reset_index().melt(id_vars=["date"], var_name="tag")
tag_hist_display.dropna(inplace=True)
tag_hist_display = tag_hist_display.to_pandas()
tag_hist_display
Out[35]:
date tag value
1761 2022-10-06 tag_#Agriculture 1
1814 2022-08-08 tag_#Agriculture 1
1815 2022-08-09 tag_#Agriculture 1
1816 2022-08-10 tag_#Agriculture 1
2174 2023-01-23 tag_#Agriculture 1
... ... ... ...
373734 2020-06-07 tag_reserved_crisis_support_loan 13
373735 2020-06-08 tag_reserved_crisis_support_loan 13
373736 2020-06-09 tag_reserved_crisis_support_loan 13
373737 2020-06-01 tag_reserved_crisis_support_loan 12
373738 2020-06-05 tag_reserved_crisis_support_loan 13

122027 rows × 3 columns

In [36]:
# Overlaid per-tag histograms (normalized to percent) of tag activity over time.
fig = px.histogram(
    tag_hist_display, x="date", y="value", color="tag", barmode="overlay", opacity=0.3, histnorm="percent", height=800
)
fig.show()

From the figure above, we can see that some tags only occur in a short timeframe, e.g.

  • #Married
  • Post-disbursed ???
  • Salesforce
  • beauty
  • Viral
  • MUFG
  • reserved_crisis_support_loan

Number of project vs tag¶

In [37]:
# Tag column list and shared figure sizing used by the plots below.
tag_columns = [a for a in ds.columns if a.startswith("tag_")]


class My:
    # Shared plot dimensions for the figures in this notebook.
    FIG_H = 800
    FIG_W = None
In [38]:
# Total number of projects carrying each tag, largest first.
project_count_per_tag = (
    ds[tag_columns]
    .sum()
    .sort_values(ascending=False)
    .to_pandas()
    .rename("number of project")
)

fig = px.bar(
    project_count_per_tag.head(20),
    orientation="v",
    text_auto=True,
    title="Number of Projects per Tag",
    height=My.FIG_H,
    width=My.FIG_W,
    labels={"x": "Categories", "y": "Number of Loans"},
)
fig.update_traces(showlegend=False)
fig.update_layout(xaxis_title=None, yaxis_title="Number of Projects")
fig.show()

Number of project vs country¶

In [39]:
# Number of projects per country, largest first.
proj_per_country = (
    ds.groupby(by=["geocode.country.name"])
    .count()["loanAmount"]
    .sort_values(ascending=False)
    .to_pandas()
    .rename("number of project")
)

fig = px.bar(
    proj_per_country.head(20),
    orientation="v",
    text_auto=True,
    title="Number of Projects per Country",
    height=My.FIG_H,
    width=My.FIG_W,
    labels={"x": "Categories", "y": "Values"},
)
fig.update_traces(showlegend=False)
fig.update_layout(xaxis_title="Country", yaxis_title="Number of Projects")
fig.show()

Collection Speed vs Tag¶

In [42]:
def get_tag_performance(_df: pd.DataFrame, num_tag: int = 10, tags=None, speed_col=None) -> pd.DataFrame:
    """Summarize collection speed per tag and keep the `num_tag` fastest.

    Parameters
    ----------
    _df : DataFrame (pandas or cudf)
        Loan rows; must contain the tag indicator columns and the speed column.
    num_tag : int
        Number of top tags (by mean speed) to return.
    tags : list[str], optional
        Tag columns to evaluate. Defaults to the notebook-level `tag_columns`.
    speed_col : str, optional
        Name of the speed column. Defaults to `COL.SPEED`.

    Returns
    -------
    pd.DataFrame indexed by tag with `speed_mean`, `speed_std`, `count`,
    sorted by mean speed descending, truncated to `num_tag` rows.
    """
    tags = tag_columns if tags is None else tags
    speed_col = COL.SPEED if speed_col is None else speed_col

    records = []
    for atag in tags:
        # Filter once per tag; the original recomputed the boolean mask for
        # mean, std and count separately (three passes per tag).
        speeds = _df[_df[atag] == 1][speed_col]
        records.append(
            {"tag": atag, "speed_mean": speeds.mean(), "speed_std": speeds.std(), "count": speeds.count()}
        )

    tags_performances = pd.DataFrame(records)
    tags_performances.dropna(subset=["speed_mean"], inplace=True)
    tags_performances.fillna(0, inplace=True)  # std is NaN when a tag has one loan
    tags_performances.sort_values("speed_mean", inplace=True, ascending=False)
    tags_performances.set_index("tag", inplace=True)
    return tags_performances.head(num_tag)


# fig = px.bar(tags_performances, y="tag", x="mean", error_x="std", text_auto=True)
# Top-20 tags by mean collection speed, across all countries.
fig = px.bar(
    get_tag_performance(ds, 20),
    y="speed_mean",
    text_auto=True,
    title="Collection Speed Mean per Tag",
    width=My.FIG_W,
    height=My.FIG_H,
)
fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
fig.show()
In [43]:
# NOTE(review): this cell is an exact duplicate of the previous figure cell
# and can likely be deleted.
fig = px.bar(
    get_tag_performance(ds, 20),
    y="speed_mean",
    text_auto=True,
    title="Collection Speed Mean per Tag",
    width=My.FIG_W,
    height=My.FIG_H,
)
fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
fig.show()

Collection speed vs Tags for Countries¶

In [44]:
# Build a lookup from ISO country code to country name (used in plot titles).
code_to_name = (
    ds[[COL.COUNTRY_NAME, COL.COUNTRY]]
    .drop_duplicates()
    .set_index(COL.COUNTRY)
    .to_dict()[COL.COUNTRY_NAME]
)
assert code_to_name["VN"] == "Vietnam"
In [47]:
# Vietnam: mean collection speed of the top tags.
country_code = "VN"
country_df = ds[ds[COL.COUNTRY] == country_code]
fig = px.bar(
    get_tag_performance(country_df),
    y="speed_mean",
    text_auto=True,
    title=f"Mean Collection Speed for {code_to_name[country_code]}",
    width=My.FIG_W,
    height=My.FIG_H,
)
# textfont_size was 26 here while every sibling country plot uses 12 — aligned.
fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
fig.show()
In [48]:
# Kenya: mean collection speed of the top tags.
country_code = "KE"
country_df = ds[ds[COL.COUNTRY] == country_code]  # renamed from misleading `vn_df`
fig = px.bar(
    get_tag_performance(country_df),
    y="speed_mean",
    text_auto=True,
    title=f"Mean Collection Speed for {code_to_name[country_code]}",
    width=My.FIG_W,
    height=My.FIG_H,
)
fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
fig.show()
In [49]:
# Cambodia: mean collection speed of the top tags.
country_code = "KH"
country_df = ds[ds[COL.COUNTRY] == country_code]  # renamed from misleading `vn_df`
fig = px.bar(
    get_tag_performance(country_df),
    y="speed_mean",
    text_auto=True,
    title=f"Mean Collection Speed for {code_to_name[country_code]}",
    width=My.FIG_W,
    height=My.FIG_H,
)
fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
fig.show()
In [50]:
# Pakistan: mean collection speed of the top tags.
country_code = "PK"
country_df = ds[ds[COL.COUNTRY] == country_code]  # renamed from misleading `vn_df`
fig = px.bar(
    get_tag_performance(country_df),
    y="speed_mean",
    text_auto=True,
    title=f"Mean Collection Speed for {code_to_name[country_code]}",
    width=My.FIG_W,
    height=My.FIG_H,
)
fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
fig.show()
In [51]:
# El Salvador: mean collection speed of the top tags.
country_code = "SV"
country_df = ds[ds[COL.COUNTRY] == country_code]  # renamed from misleading `vn_df`
fig = px.bar(
    get_tag_performance(country_df),
    y="speed_mean",
    text_auto=True,
    title=f"Mean Collection Speed for {code_to_name[country_code]}",
    width=My.FIG_W,
    height=My.FIG_H,
)
fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
fig.show()

Correlation between tags and collection speed

In [52]:
# Kendall correlation between each tag indicator and collection speed
# (computed on host: corrwith with method="kendall" is a pandas feature).
tag_df = ds[tag_columns].to_pandas()
corr = tag_df.corrwith(ds[COL.SPEED].to_pandas(), method="kendall", drop=True)
corr.rename("correlation", inplace=True)
fig = px.bar(
    corr.sort_values(ascending=False),
    orientation="h",
    text_auto=True,
    title="Correlation between Tags and Collection Speed",  # fixed typo "betwene"
    height=My.FIG_H,
)
fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
fig.update_layout(xaxis_title="kendall correlation score", yaxis_title=None)
fig.update_traces(showlegend=False)
fig.show()

Influence of the number of tags¶

Above-average speed vs number of tags

In [53]:
# Distribution of tags-per-project, with the mean marked as a vertical line.
tag_count = ds[tag_columns].sum(axis=1)
tag_count_df = tag_count.value_counts()
tag_count_mean = tag_count.mean()
tag_count_std = tag_count.std()  # NOTE(review): computed but never used below
fig = px.bar(
    tag_count_df.to_pandas(),
    text_auto=True,
    title="Distribution of Number of Tags per Project",
    width=My.FIG_W,
    height=My.FIG_H,
)
fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
fig.update_xaxes(tickmode="linear")
fig.update_layout(xaxis_title="Number of Tags", yaxis_title="Number of Project")
fig.update_traces(showlegend=False)
fig.add_vline(x=tag_count_mean)
fig.show()
In [54]:
# Number of above-average speed collection vs number of tags
# Number of above-average speed collection vs number of tags:
# flag each loan whose speed meets or exceeds the global mean.
speed_mean = ds[COL.SPEED].mean()
is_above_average = ds[COL.SPEED] >= speed_mean
In [55]:
# Pair each project's tag count with its above-average flag.
# NOTE(review): `tag_count_df` is reused here for a different frame than in
# the distribution cell above.
tag_count_df = tag_count.to_frame(name="tag_count").join(is_above_average.rename("is_above_average"))
In [56]:
# Per tag-count bucket: above-average projects (sum of the flag) and total (count).
mn = tag_count_df.groupby("tag_count").agg(["sum", "count"])
In [57]:
# Share (percent) of above-average-speed projects in each tag-count bucket.
mn["percentage"] = mn["is_above_average"]["sum"] / mn["is_above_average"]["count"] * 100
In [58]:
# Bar chart: percentage of above-average projects per number of tags.
fig = px.bar(
    mn["percentage"],
    text_auto=True,
    title="Projects with above-average collection speed based on the number of tags",
    labels={"value": "Percentage of Project with Collection Speed above Global Average"},
    height=My.FIG_H,
)
# fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
fig.update_traces(showlegend=False)
fig.update_xaxes(tickmode="linear")
fig.show()

Effectiveness of Tags on Top 5% Collection Speed

In [ ]:
 

Pair of tags Performance¶

In [59]:
# NOTE(review): imports are conventionally grouped in the notebook's first
# code cell; kept here so this section stays self-contained.
from itertools import combinations
In [60]:
# Materialize one boolean column per unordered pair of tags: True when a
# loan carries both tags of the pair.
tag2_columns = []
for first, second in combinations(tag_columns, 2):
    pair_name = "__".join((first, second))
    tag2_columns.append(pair_name)
    ds[pair_name] = ds[first] & ds[second]
# Sanity check: every pair column only holds 0/1 (False/True) values.
assert ds[tag2_columns].max().max() == 1
assert ds[tag2_columns].min().min() == 0
In [61]:
# Restrict the pair analysis to loans whose speed is at least the global mean.
speed_mean = ds[COL.SPEED].mean()
above_average = ds.loc[ds[COL.SPEED] >= speed_mean]
In [62]:
# For each pair column, mean/std of (pair indicator x speed): rows without
# the pair contribute 0, so the mean reflects both prevalence and speed.
tag2_performances = []
for pair_col in tqdm(tag2_columns):
    weighted = above_average[pair_col] * above_average[COL.SPEED]
    tag2_performances.append(
        {"tag": pair_col, "mean": weighted.mean(), "std": weighted.std()}
    )
100%|██████████| 1711/1711 [00:01<00:00, 922.80it/s]
In [63]:
# Top 20 tag pairs by mean weighted speed, kept in ascending order so the
# horizontal bar chart below lists them bottom-up.
la = (
    pd.DataFrame(tag2_performances)
    .dropna(subset=["mean"])
    .sort_values("mean")
    .tail(20)
)
la
Out[63]:
tag mean std
451 tag_#Elderly__tag_#Parent 61.412328 5738.995627
1123 tag_#Parent__tag_#Single Parent 61.426469 2442.729082
1375 tag_#Single Parent__tag_#Woman-Owned Business 65.372891 2354.055291
1645 tag_#Widowed__tag_#Woman-Owned Business 72.374097 2441.126271
1322 tag_#Schooling__tag_#Woman-Owned Business 73.109297 4022.540638
1634 tag_#Vegan__tag_#Woman-Owned Business 74.000523 5939.475894
1276 tag_#Repeat Borrower__tag_#Schooling 80.544568 3847.419643
80 tag_#Animals__tag_#Parent 101.176272 2248.550738
364 tag_#EcoFriendly__tag_#Technology 103.516060 1141.510431
524 tag_#Fabrics__tag_#Woman-Owned Business 109.132849 3779.977944
339 tag_#EcoFriendly__tag_#Health and Sanitation 109.789913 1069.773658
64 tag_#Animals__tag_#Elderly 110.556436 3086.292794
1121 tag_#Parent__tag_#Schooling 114.049577 4360.514412
104 tag_#Animals__tag_#Woman-Owned Business 234.278497 4059.329960
85 tag_#Animals__tag_#Repeat Borrower 234.753517 4337.145020
1120 tag_#Parent__tag_#Repeat Borrower 353.527412 8256.977832
456 tag_#Elderly__tag_#Repeat Borrower 401.836528 9297.254081
475 tag_#Elderly__tag_#Woman-Owned Business 473.375115 8582.458403
1139 tag_#Parent__tag_#Woman-Owned Business 738.201803 10952.096874
1294 tag_#Repeat Borrower__tag_#Woman-Owned Business 888.668345 11548.088531
In [64]:
fig = px.bar(la, y="tag", x="mean", title="Effectiveness of Tags on Collection Speed", width=My.FIG_W, height=My.FIG_H)
# fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
# fig.update_xaxes(tickmode='linear')
# fig.add_vline(x=tag_count_mean)
fig.show()

Save the results¶

In [66]:
# Convert the notebook to HTML for easy reading and sharing.
!jupyter nbconvert --to html 1_data_overview.ipynb
[NbConvertApp] Converting notebook 1_data_overview.ipynb to html
[NbConvertApp] Writing 390204 bytes to 1_data_overview.html